library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(boot)
library(ggplot2)
library(readxl)
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(stringr)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

EDA Order

EDA

data <- read_excel('data/Sample_Customore_Order raw.xlsx',  sheet = 'raw')
glimpse(data)
## Rows: 61,728
## Columns: 11
## $ customer_unique_id    <dbl> 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14…
## $ order_id              <chr> "A000000001", "A000000002", "A000000003", "A0000…
## $ item_quantity         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ created_day           <dttm> 2019-07-01, 2019-07-01, 2019-07-01, 2019-07-01,…
## $ onsite_original_price <dbl> 338000, 175000, 520000, 238000, 159000, 135000, …
## $ selling_price         <dbl> 259000, 139000, 359000, 185000, 129000, 105000, …
## $ shipping_fee          <dbl> 3000, 0, 14927, 29645, 14927, 0, 3000, 3000, 402…
## $ voucher_platform      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ voucher_seller        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ payment_method        <chr> "Airpay GIRO", "Cash on Delivery", "Cybersource"…
## $ order_status          <chr> "CANCELLED", "COMPLETED", "COMPLETED", "COMPLETE…
duplicate_rows <- data %>% dplyr::filter(duplicated(.))

# View the duplicate rows
glimpse(duplicate_rows)
## Rows: 1,897
## Columns: 11
## $ customer_unique_id    <dbl> 26, 54, 84, 92, 137, 156, 214, 240, 341, 483, 64…
## $ order_id              <chr> "A000000027", "A000000055", "A000000086", "A0000…
## $ item_quantity         <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ created_day           <dttm> 2019-07-01, 2019-07-01, 2019-07-01, 2019-07-01,…
## $ onsite_original_price <dbl> 215000, 7000, 149000, 370000, 298000, 59000, 175…
## $ selling_price         <dbl> 179000, 7000, 119000, 289000, 259000, 45000, 149…
## $ shipping_fee          <dbl> 39974, 39000, 10500, 46426, 51425, 0, 0, 0, 0, 0…
## $ voucher_platform      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ voucher_seller        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ payment_method        <chr> "Cash on Delivery", "Cash on Delivery", "Cash on…
## $ order_status          <chr> "CANCELLED", "COMPLETED", "COMPLETED", "COMPLETE…
data <- data |>  distinct()
glimpse(data)
## Rows: 59,831
## Columns: 11
## $ customer_unique_id    <dbl> 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14…
## $ order_id              <chr> "A000000001", "A000000002", "A000000003", "A0000…
## $ item_quantity         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ created_day           <dttm> 2019-07-01, 2019-07-01, 2019-07-01, 2019-07-01,…
## $ onsite_original_price <dbl> 338000, 175000, 520000, 238000, 159000, 135000, …
## $ selling_price         <dbl> 259000, 139000, 359000, 185000, 129000, 105000, …
## $ shipping_fee          <dbl> 3000, 0, 14927, 29645, 14927, 0, 3000, 3000, 402…
## $ voucher_platform      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ voucher_seller        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ payment_method        <chr> "Airpay GIRO", "Cash on Delivery", "Cybersource"…
## $ order_status          <chr> "CANCELLED", "COMPLETED", "COMPLETED", "COMPLETE…
data_total_price <- data |> mutate(total_price = selling_price + shipping_fee,
                  .before = 1)
data_sum <- data_total_price
data_sum <- data_sum |> group_by(created_day) |> summarise(total_price = sum(total_price))
glimpse(data_sum)
## Rows: 27
## Columns: 2
## $ created_day <dttm> 2019-07-01, 2019-07-02, 2019-07-03, 2019-07-04, 2019-07-0…
## $ total_price <dbl> 146796201, 134045071, 176594494, 130569104, 177917512, 422…
highlight_date <- as.Date(c("2019-07-12", "2019-07-21"))

# Create the ggplot2 object
p <- ggplot(data_sum, aes(x=created_day, y=total_price)) +
  geom_line(color="steelblue") + 
  geom_point(data = subset(data_sum, created_day %in% highlight_date),
             aes(x = created_day, y = total_price),
             color = "red", size = 2, shape = 21, fill = "red", show.legend = TRUE) + 
  xlab("") +
  geom_text(data = subset(data_sum, created_day %in% highlight_date),
            aes(x = created_day, y = total_price, 
                label = paste("\n\nDate:", created_day, "   Total:", total_price)),
            vjust = -1.5, hjust = 1.1, color = "black", size = 3) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

p <- ggplotly(p)
p
data_mean <- data_total_price
data_mean <- data_mean |> group_by(created_day) |> summarise(mean_price = mean(total_price))
glimpse(data_mean)
## Rows: 27
## Columns: 2
## $ created_day <dttm> 2019-07-01, 2019-07-02, 2019-07-03, 2019-07-04, 2019-07-0…
## $ mean_price  <dbl> 235627.9, 245055.0, 229641.7, 232329.4, 224077.5, 227207.6…
p <- ggplot(data_mean, aes(x=created_day, y=mean_price)) +
  geom_line( color="steelblue") +
  geom_point(data = subset(data_mean, created_day %in% highlight_date),
             aes(x = created_day, y = mean_price),
             color = "red", size = 2, shape = 21, fill = "red") +
  xlab("") +
  geom_text(data = subset(data_mean, created_day %in% highlight_date),
            aes(x = created_day, y = mean_price, 
                label = paste("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tDate:", created_day, "\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tMean:", round(mean_price))),
            vjust = -1.5, hjust = 1.1, color = "black", size = 3) +
  theme(axis.text.x=element_text(angle=60, hjust=1))
p <- ggplotly(p)
p
data_factor <- data
data_factor$payment_method <- factor(data_factor$payment_method)
data_factor$order_status <- factor(data_factor$order_status)

group_payment_method <- function(method) {
  case_when(
    str_detect(method, "VN Airpay Ibanking") ~ "VN Airpay Ibanking",
    TRUE ~ as.character(method)
  )
}
data_factor <- data_factor |> mutate(payment_method = sapply(payment_method, group_payment_method))
data_factor$payment_method <- factor(data_factor$payment_method)
data_factor$item_quantity <- factor(data_factor$item_quantity)
data_factor <- data_factor |> mutate(total_price = selling_price + shipping_fee,
                  .before = 1)
p <- ggplot_count_plot <- ggplot(data_factor, aes(x = payment_method)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Count of Payment Methods",
       x = "Payment Method",
       y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- ggplotly(p)
p
p <- ggplot_count_plot <- ggplot(data_factor, aes(x = payment_method, fill = order_status)) +
  geom_bar() +
  scale_fill_manual(values = c("COMPLETED" = "steelblue", "CANCELLED" = "tomato")) +
  labs(title = "Count of Payment Methods by Order Status",
       x = "Payment Method",
       y = "Count",
       fill = "Order Status") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- ggplotly(p)
p
data_factor1 <- data_factor |> dplyr::filter(data_factor$payment_method != "Cash on Delivery")
p <- ggplot_count_plot <- ggplot(data_factor1, aes(x = payment_method, fill = order_status)) +
  geom_bar() +
  scale_fill_manual(values = c("COMPLETED" = "steelblue", "CANCELLED" = "tomato")) +
  labs(title = "Count of Payment Methods by Order Status (excluded COD)",
       x = "Payment Method",
       y = "Count",
       fill = "Order Status") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- ggplotly(p)
p
p <- ggplot_count_plot <- ggplot(data_factor, aes(x = item_quantity)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Count of Item Quantity",
       x = "Item Quantity",
       y = "Count") +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))
p <- ggplotly(p)
p
data_sum_orderid <- data_factor
data_sum_orderid <- data_sum_orderid |> group_by(customer_unique_id, order_id, order_status, payment_method) |>  summarise(total_price = sum(total_price), .groups = 'drop')
glimpse(data_sum_orderid)
## Rows: 37,835
## Columns: 5
## $ customer_unique_id <dbl> 1, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 9, 10, 10…
## $ order_id           <chr> "A000000001", "A000004553", "A000000002", "A0000000…
## $ order_status       <fct> CANCELLED, COMPLETED, COMPLETED, COMPLETED, COMPLET…
## $ payment_method     <fct> Airpay GIRO, Airpay GIRO, Cash on Delivery, Cyberso…
## $ total_price        <dbl> 262000, 232000, 234000, 373927, 214645, 143927, 150…
data_sum_order_complete <- data_sum_orderid |> dplyr::filter(data_sum_orderid$order_status == "COMPLETED")
# Define the two payment methods to highlight
highlight_methods <- c("Airpay GIRO", "Cash on Delivery")


p <- ggplot(data_sum_order_complete, aes(x = payment_method, y = total_price)) +
  geom_jitter(aes(color = ifelse(payment_method %in% highlight_methods, as.character(payment_method), "Other")),
              width = 0.2, height = 0) + 
  scale_color_manual(values = c(
    "Airpay GIRO" = "steelblue",
    "Cash on Delivery" = "tomato",
    "Other" = "grey"
  )) +
  labs(title = "Scatter Plot of Payment Method vs Revenue",
       x = "Payment Method",
       y = "Revenue",
       color = "Payment Method") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

p

data_summary <- data_sum_orderid %>%
  group_by(order_status) %>%
  summarize(count = n()) %>%
  mutate(percentage = count / sum(count) * 100)  # Calculate percentage

# Create a pie chart for the ratio of completed to canceled orders with custom colors
p <- ggplot(data_summary, aes(x = "", y = count, fill = order_status)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar(theta = "y") +
  labs(title = "Ratio of Completed to Canceled Orders") +
  theme_void() +
  theme(legend.title = element_blank()) +
  geom_text(aes(label = paste0(round(percentage, 1), "%")),
            position = position_stack(vjust = 0.5),
            color = "black", 
            size = 4,         
            fontface = "bold") +
  scale_fill_manual(values = c("COMPLETED" = "steelblue", "CANCELLED" = "tomato"))


# Print the interactive plotly plot
p

AB testing

data_sum_test <- data_total_price
data_sum_test <- data_sum_test |> group_by(customer_unique_id, order_id, order_status, shipping_fee) |>  summarise(shipping_fee = unique(shipping_fee), .groups = 'drop')
glimpse(data_sum_test)
## Rows: 37,835
## Columns: 4
## $ customer_unique_id <dbl> 1, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 9, 10, 10…
## $ order_id           <chr> "A000000001", "A000004553", "A000000002", "A0000000…
## $ order_status       <chr> "CANCELLED", "COMPLETED", "COMPLETED", "COMPLETED",…
## $ shipping_fee       <dbl> 3000, 3000, 0, 14927, 29645, 14927, 0, 3000, 40297,…
data_sum_order_complete_test <- data_sum_test |> dplyr::filter(data_sum_test$order_status == "COMPLETED")
glimpse(data_sum_order_complete_test)
## Rows: 30,273
## Columns: 4
## $ customer_unique_id <dbl> 1, 2, 3, 4, 6, 7, 8, 8, 8, 8, 8, 10, 10, 11, 15, 16…
## $ order_id           <chr> "A000004553", "A000000002", "A000000003", "A0000000…
## $ order_status       <chr> "COMPLETED", "COMPLETED", "COMPLETED", "COMPLETED",…
## $ shipping_fee       <dbl> 3000, 0, 14927, 29645, 0, 3000, 40297, 0, 35641, 63…
data_sum_order_cancel_test <- data_sum_test |> dplyr::filter(data_sum_test$order_status == "CANCELLED")
glimpse(data_sum_order_cancel_test)
## Rows: 7,562
## Columns: 4
## $ customer_unique_id <dbl> 1, 5, 8, 9, 11, 12, 12, 12, 13, 14, 19, 26, 34, 41,…
## $ order_id           <chr> "A000000001", "A000000005", "A000000008", "A0000000…
## $ order_status       <chr> "CANCELLED", "CANCELLED", "CANCELLED", "CANCELLED",…
## $ shipping_fee       <dbl> 3000, 14927, 40297, 0, 29980, 32100, 32150, 32100, …
data_sum_test |> dplyr::filter(data_sum_test$customer_unique_id == 8)
## # A tibble: 6 × 4
##   customer_unique_id order_id   order_status shipping_fee
##                <dbl> <chr>      <chr>               <dbl>
## 1                  8 A000000008 CANCELLED           40297
## 2                  8 A000000318 COMPLETED           40297
## 3                  8 A000000668 COMPLETED               0
## 4                  8 A000002917 COMPLETED           35641
## 5                  8 A000014714 COMPLETED           63045
## 6                  8 A000031049 COMPLETED             500
ab_test <- merge(data_sum_order_complete_test, data_sum_order_cancel_test, by = "customer_unique_id")
glimpse(ab_test)
## Rows: 4,788
## Columns: 7
## $ customer_unique_id <dbl> 1, 8, 8, 8, 8, 8, 11, 19, 26, 34, 41, 59, 66, 66, 6…
## $ order_id.x         <chr> "A000004553", "A000014714", "A000000318", "A0000310…
## $ order_status.x     <chr> "COMPLETED", "COMPLETED", "COMPLETED", "COMPLETED",…
## $ shipping_fee.x     <dbl> 3000, 63045, 40297, 500, 0, 35641, 0, 3000, 3000, 0…
## $ order_id.y         <chr> "A000000001", "A000000008", "A000000008", "A0000000…
## $ order_status.y     <chr> "CANCELLED", "CANCELLED", "CANCELLED", "CANCELLED",…
## $ shipping_fee.y     <dbl> 3000, 40297, 40297, 40297, 40297, 40297, 29980, 300…
test1_t <- ab_test
test2_t <- ab_test
test1_t[5:7] <- NULL 
glimpse(test1_t)
## Rows: 4,788
## Columns: 4
## $ customer_unique_id <dbl> 1, 8, 8, 8, 8, 8, 11, 19, 26, 34, 41, 59, 66, 66, 6…
## $ order_id.x         <chr> "A000004553", "A000014714", "A000000318", "A0000310…
## $ order_status.x     <chr> "COMPLETED", "COMPLETED", "COMPLETED", "COMPLETED",…
## $ shipping_fee.x     <dbl> 3000, 63045, 40297, 500, 0, 35641, 0, 3000, 3000, 0…
test2_t[2:4] <- NULL 
glimpse(test2_t)
## Rows: 4,788
## Columns: 4
## $ customer_unique_id <dbl> 1, 8, 8, 8, 8, 8, 11, 19, 26, 34, 41, 59, 66, 66, 6…
## $ order_id.y         <chr> "A000000001", "A000000008", "A000000008", "A0000000…
## $ order_status.y     <chr> "CANCELLED", "CANCELLED", "CANCELLED", "CANCELLED",…
## $ shipping_fee.y     <dbl> 3000, 40297, 40297, 40297, 40297, 40297, 29980, 300…

We consider the question: “Does the shipping fee of an order influence whether a customer accepts or cancels it?”

We will focus on customers who meet both of the following conditions: “have canceled at least one order” and “have completed at least one order.”

test1_t <- test1_t  |>  distinct()
test2_t <- test2_t  |>  distinct()
test1_sum_t <- test1_t |> group_by(customer_unique_id) |> summarise(shipping_fee.x = sum(shipping_fee.x))
test2_sum_t <- test2_t |> group_by(customer_unique_id) |> summarise(shipping_fee.y = sum(shipping_fee.y))
ab_tess_1 <- merge(test1_sum_t, test2_sum_t, by = "customer_unique_id")
ab_tess_1 <- ab_tess_1 |> rename(shipping_fee_completed = shipping_fee.x) |> rename(shipping_fee_cancelled = shipping_fee.y)
glimpse(ab_tess_1)
## Rows: 3,004
## Columns: 3
## $ customer_unique_id     <dbl> 1, 8, 11, 19, 26, 34, 41, 59, 66, 67, 68, 73, 9…
## $ shipping_fee_completed <dbl> 3000, 139483, 0, 3000, 3000, 0, 28659, 0, 2000,…
## $ shipping_fee_cancelled <dbl> 3000, 40297, 29980, 3000, 39974, 16050, 28659, …
mean_complete <- mean(ab_tess_1$shipping_fee_completed )
mean_cancel <- mean(ab_tess_1$shipping_fee_cancelled)
print(sprintf("Mean of price for completed order: %.10f", mean_complete))
## [1] "Mean of price for completed order: 12897.2749667111"
print(sprintf("Mean of price for cancelled order: %.10f", mean_cancel))
## [1] "Mean of price for cancelled order: 16496.6394806924"
data_plot <- tibble(
  order_status = c("Completed", "Canceled"),
  mean_shipping_fee = c(mean_complete, mean_cancel)
)
# Draw a bar plot using ggplot
p <- ggplot(data_plot, aes(x = order_status, y = mean_shipping_fee, fill = order_status)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Mean Shipping Fee by Order Status (for a selected group of customer)",
       x = "Order Status",
       y = "Mean Shipping Fee") +
  theme_minimal() +
  scale_fill_manual(values = c("Completed" = "steelblue", "Canceled" = "tomato")) +
  theme(text = element_text(size = 10, face = "bold"))

# Print the plot
print(p)

For the selected customers, we observe that the average shipping fee for completed orders is lower than that of the orders they canceled.

A hypothesis can be proposed: “The shipping fee for canceled orders is higher than that for successful orders.” Therefore, we need to test the following null and alternative hypotheses:

Null hypothesis: \(H_0: \mu_0 = \mu_1\) Alternative hypothesis: \(H_1: \mu_1 < \mu_0\)

Where \(\mu_0\) is the average shipping fee of canceled orders, and \(\mu_1\) is the average shipping fee of completed orders.

If \(H_0\) is true, the difference in shipping fees between successful and canceled orders is purely a result of chance and is not statistically significant. To test this hypothesis, we will use a permutation test, and the p-value will be calculated for the left-tailed test.

perm_fun <- function(x1, x0, R) 
{
  n1 <- length(x1)
  n0 <- length(x0)
  n <- n1 + n0
  mean_diff <- numeric(R)
  combined_data <- c(x1, x0)
  
  for (i in 1:R) {
    idx_1 <- sample(x = 1:n, size = n1)
    idx_0 <- setdiff(1:n, idx_1)
    mean_diff[i] <- mean(combined_data[idx_1]) - mean(combined_data[idx_0])
  }
  
  return(mean_diff)
}

x1 <- ab_tess_1$shipping_fee_completed
x0 <- ab_tess_1$shipping_fee_cancelled

# Set the number of permutations
R <- 10000

# Run the permutation test
set.seed(42)
mean_diffs <- perm_fun(x1, x0, R)
ggplot(data = tibble(perm_diffs = mean_diffs), aes(x = perm_diffs)) +
  geom_histogram(bins = 10, fill = "gray80", color = "black") +
  labs(x = "Prices differences", y = "Frequency") +
  theme_bw()

result <- mean(mean_diffs < (mean_complete - mean_cancel))
# Print the result
print(result)
## [1] 0

Since the p-value = 0 is smaller than both significance levels of 0.05 and 0.01, we reject the null hypothesis. This indicates that there is strong statistical evidence to conclude that the difference in shipping fees between completed and canceled orders is not due to random chance.

Given the statistical significance, it suggests that higher shipping fees are indeed associated with a higher likelihood of order cancellation. In other words, the evidence supports the idea that shipping fees impact whether customers complete or cancel their orders.

EDA Traffic

EDA

traffic <- read_excel('data/Sample_Customore_Traffic raw.xlsx')
glimpse(traffic)
## Rows: 198
## Columns: 10
## $ `Source / Medium`           <chr> "google / cpc", "youtube / social", "(dire…
## $ Users                       <dbl> 407950, 77785, 64653, 48721, 27718, 14373,…
## $ `New Users`                 <dbl> 344502, 55537, 60181, 32781, 17774, 10110,…
## $ Sessions                    <dbl> 723208, 163447, 108534, 176662, 42464, 176…
## $ `Bounce Rate`               <dbl> 0.6219013, 0.7881148, 0.5532644, 0.8440015…
## $ `Pages / Session`           <dbl> 3.556041, 2.244324, 4.120340, 1.858996, 2.…
## $ `Avg. Session Duration`     <dbl> 176.02568, 89.94474, 200.42917, 73.20609, …
## $ `Ecommerce Conversion Rate` <dbl> 0.009078992, 0.005855109, 0.009655960, 0.0…
## $ Transactions                <dbl> 6566, 957, 1048, 500, 508, 127, 103, 43, 7…
## $ Revenue                     <dbl> 6656088123.8, 842907936.0, 1040851446.9, 5…
traffic <- traffic |> janitor::clean_names()
traffic <- na.omit(traffic)
glimpse(traffic)
## Rows: 197
## Columns: 10
## $ source_medium             <chr> "google / cpc", "youtube / social", "(direct…
## $ users                     <dbl> 407950, 77785, 64653, 48721, 27718, 14373, 3…
## $ new_users                 <dbl> 344502, 55537, 60181, 32781, 17774, 10110, 1…
## $ sessions                  <dbl> 723208, 163447, 108534, 176662, 42464, 17631…
## $ bounce_rate               <dbl> 0.6219013, 0.7881148, 0.5532644, 0.8440015, …
## $ pages_session             <dbl> 3.556041, 2.244324, 4.120340, 1.858996, 2.98…
## $ avg_session_duration      <dbl> 176.02568, 89.94474, 200.42917, 73.20609, 14…
## $ ecommerce_conversion_rate <dbl> 0.009078992, 0.005855109, 0.009655960, 0.002…
## $ transactions              <dbl> 6566, 957, 1048, 500, 508, 127, 103, 43, 7, …
## $ revenue                   <dbl> 6656088123.8, 842907936.0, 1040851446.9, 534…
traffic_num <- traffic[ -c(1) ]
cor_matrix <- cor(traffic_num)


melted_cor <- melt(cor_matrix)


ggplot(melted_cor, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  geom_text(aes(label=round(value, 2)), color="black", size=3) +
  scale_fill_gradient2(low = "tomato", high = "tomato", mid = "white", midpoint = 0, 
                       name = "Correlation") +
  theme_minimal() +
  labs(x = "Variables", y = "Variables", title = "Correlation Plot") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

traffic1 <- traffic |> mutate(percent_of_new_user = (new_users / users),
                  .before = 4)
glimpse(traffic1)
## Rows: 197
## Columns: 11
## $ source_medium             <chr> "google / cpc", "youtube / social", "(direct…
## $ users                     <dbl> 407950, 77785, 64653, 48721, 27718, 14373, 3…
## $ new_users                 <dbl> 344502, 55537, 60181, 32781, 17774, 10110, 1…
## $ percent_of_new_user       <dbl> 0.8444711, 0.7139808, 0.9308307, 0.6728310, …
## $ sessions                  <dbl> 723208, 163447, 108534, 176662, 42464, 17631…
## $ bounce_rate               <dbl> 0.6219013, 0.7881148, 0.5532644, 0.8440015, …
## $ pages_session             <dbl> 3.556041, 2.244324, 4.120340, 1.858996, 2.98…
## $ avg_session_duration      <dbl> 176.02568, 89.94474, 200.42917, 73.20609, 14…
## $ ecommerce_conversion_rate <dbl> 0.009078992, 0.005855109, 0.009655960, 0.002…
## $ transactions              <dbl> 6566, 957, 1048, 500, 508, 127, 103, 43, 7, …
## $ revenue                   <dbl> 6656088123.8, 842907936.0, 1040851446.9, 534…
traffic1 <- traffic1[order(-traffic1$revenue), ][1:30,]
top_10_users <- traffic1[1:10,]

# Display the result
print(top_10_users)
## # A tibble: 10 × 11
##    source_medium        users new_users percent_of_new_user sessions bounce_rate
##    <chr>                <dbl>     <dbl>               <dbl>    <dbl>       <dbl>
##  1 google / cpc        407950    344502               0.844   723208       0.622
##  2 (direct) / (none)    64653     60181               0.931   108534       0.553
##  3 youtube / social     77785     55537               0.714   163447       0.788
##  4 facebook / social    48721     32781               0.673   176662       0.844
##  5 youtube.com / refe…  27718     17774               0.641    42464       0.639
##  6 l.facebook.com / r…   3661      1712               0.468     7144       0.371
##  7 m.facebook.com / r…  14373     10110               0.703    17631       0.700
##  8 newsletter / email     781       393               0.503     2168       0.405
##  9 zalo / zalo           2782      2400               0.863     4029       0.668
## 10 facebook.com / ref…   1594       694               0.435     2557       0.496
## # ℹ 5 more variables: pages_session <dbl>, avg_session_duration <dbl>,
## #   ecommerce_conversion_rate <dbl>, transactions <dbl>, revenue <dbl>
highlight_sources <- c("google / cpc", "(direct) / (none)")

# Create a new column to highlight only selected sources
top_10_users$highlight <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "Other")

# Create the scatter plot
ggplot(top_10_users, aes(x = users, y = revenue, size = revenue, color = highlight)) +
  geom_point(alpha = 0.7) +  # Add transparency to the points
  scale_size_continuous(range = c(3, 12), guide = "none") +  # Hide size legend
  scale_x_log10() +  # Apply log10 transformation to the x-axis
  scale_y_log10() +  # Apply log10 transformation to the y-axis
  scale_color_manual(values = c("(direct) / (none)" = "green", "google / cpc" = "blue", "Other" = "grey")) +  # Custom colors
  labs(x = "Users (log10)", y = "Revenue (log10)", color = "Source Medium") +
  ggtitle("Scatter Plot Highlighting Two Source Mediums") +
  theme_minimal() +
  theme(legend.position = "right",
    legend.text = element_text(size = 10),  # Increase legend text size
    legend.title = element_text(size = 10))

top_10_users <- top_10_users |> dplyr::filter(top_10_users$source_medium != "google / cpc")
highlight_sources <- c("youtube / social", "(direct) / (none)", "facebook / social")


top_10_users$highlight <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "Other")


ggplot(top_10_users, aes(x = users, y = revenue, size = revenue, color = highlight)) +
  geom_point(alpha = 0.7) +  # Add transparency 
  scale_size_continuous(range = c(3, 12), guide = "none") +  # Hide size legend
  scale_x_log10() +  # Apply log10 transformation 
  scale_y_log10() +  # Apply log10 transformation 
  scale_color_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "Other" = "grey")) +  # Custom colors
  labs(x = "Users (log10)", y = "Revenue (log10)", color = "Source Medium") +
  ggtitle("Scatter Plot Highlighting Three Source Mediums") +
  theme_minimal() +
  theme(legend.position = "right",
    legend.text = element_text(size = 10),  # Increase legend text size
    legend.title = element_text(size = 10))

# Create a new column to highlight only selected sources
top_10_users$highlight <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "Other")


ggplot(top_10_users, aes(x = new_users, y = revenue, size = revenue, color = highlight)) +
  geom_point(alpha = 0.7) +  # Add transparency 
  scale_size_continuous(range = c(3, 12), guide = "none") +  # Hide size legend
  scale_x_log10() +  # Apply log10 transformation 
  scale_y_log10() +  # Apply log10 transformation 
  scale_color_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "Other" = "grey")) +  # Custom colors
  labs(x = "New Users (log10)", y = "Revenue (log10)", color = "Source Medium") +
  ggtitle("Scatter Plot Highlighting Three Source Mediums") +
  theme_minimal() +
  theme(legend.position = "right",
    legend.text = element_text(size = 10),  # Increase legend text size
    legend.title = element_text(size = 10))

top_10_users$color <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "gray")


ggplot(top_10_users, aes(x = reorder(source_medium, percent_of_new_user), y = percent_of_new_user, fill = color)) +
  geom_bar(stat = "identity") +  # Bar plot
  coord_flip() +  # Flip coordinates for better readability
  scale_fill_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "gray" = "gray")) +  # Custom colors
  labs(x = "Source Medium", y = "Percent of new users", title = "Percent of new users by Source Medium") +
  theme_minimal() +
  theme(legend.position = "none")

top_10_users$color <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "gray")


ggplot(top_10_users, aes(x = reorder(source_medium, bounce_rate), y = bounce_rate, fill = color)) +
  geom_bar(stat = "identity") +  # Bar plot
  coord_flip() +  # Flip coordinates for better readability
  scale_fill_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "gray" = "gray")) +  # Custom colors
  labs(x = "Source Medium", y = "Bounce Rate (Lower is better)", title = "Bounce Rate by Source Medium") +
  theme_minimal() +
  theme(legend.position = "none")

top_10_users$color <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "gray")


ggplot(top_10_users, aes(x = reorder(source_medium, ecommerce_conversion_rate), y = ecommerce_conversion_rate, fill = color)) +
  geom_bar(stat = "identity") +  # Bar plot
  coord_flip() +  # Flip coordinates for better readability
  scale_fill_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "gray" = "gray")) +  # Custom colors
  labs(x = "Source Medium", y = "Ecommerce Conversion Rate (Higher is better)", title = "Ecommerce Conversion Rate by Source Medium") +
  theme_minimal() +
  theme(legend.position = "none")

top_10_users$color <- ifelse(top_10_users$source_medium %in% highlight_sources, top_10_users$source_medium, "gray")


ggplot(top_10_users, aes(x = reorder(source_medium, avg_session_duration), y = avg_session_duration, fill = color)) +
  geom_bar(stat = "identity") +  # Bar plot
  coord_flip() +  # Flip coordinates for better readability
  scale_fill_manual(values = c("(direct) / (none)" = "green", "youtube / social" = "red", "facebook / social" = "blue", "gray" = "gray")) +  # Custom colors
  labs(x = "Source Medium", y = "Average session duration (Higher is better)", title = "Average session duration by Source Medium") +
  theme_minimal() +
  theme(legend.position = "none")